import requests
import re
from jsonpath import jsonpath
import json

# Landing page for the "important news" JSONP feed; the response is
# JavaScript of the form `data_callback([...])`, not bare JSON.
index_url = 'https://news.163.com/special/cm_yaowen20200213/?callback=data_callback'
# Request headers that mimic a desktop Chrome browser session.
# NOTE(review): the Cookie below is a captured session value — it is
# account/time specific and will eventually expire; refresh it if requests
# start failing or returning different content.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'Referer':'https://news.163.com/',
    'Cookie':'_ntes_nnid=0ede1436cf5bb9fd1d48c9fa7abaa2e6,1675945494282; _ntes_nuid=0ede1436cf5bb9fd1d48c9fa7abaa2e6; __bid_n=18770bbd6466569efd4207; FPTOKEN=xOHXivsMMRK6KirXr4Af/hwBcgdmA98UHq8RvKV4tH9xe3J6flvDVyoYGxJ53KPfqzwCDUmWB8faKDR3H3OuKMGanVInTzyEwGkyKn/FklPIRdQzoUyM/7Mxpwg4MTYNq5R1MxF6cR0FQPGCqgfdxK+hL08IAwVjp2ZEiE/0lxpD062qZ6J40vsKP0PsZNHgnbnI17CERFWuiVkX0A6ZAvJdNT0K8pC9WGIdUn0/5563jOgvLQjppLDxXyeBUC8Q5rySVZzfIyXkQemjH5eK++kFAq+H36uZ06D0ZEZsZlFrDWheJ8n7JfQURT7tcxaIe/f5yBuYA8OjeOxjqd3OAAThqODubZ2rfRSPzCKecziYVsA8YtaAMo0J5tpO4bHwWCZ6oFvOohalOGa+aeSP3A==|uvIi6SiKikYQO7M4ycfLcZNhq+pTxvICk9N3S0qSdG0=|10|690a48c9936218423a0d3ccce22a8fe0; s_n_f_l_n3=343c7f066a1bc4f01686830608216; _antanalysis_s_id=1686830608660; BAIDU_SSP_lcr=https://www.baidu.com/link?url=PIGix82PsazBwAOLJYCJuTyna1rh2Cw2yULnxhDo14G&wd=&eqid=c7f5390d0001d7eb00000006648afe0c; ne_analysis_trace_id=1686830695228; vinfo_n_f_l_n3=343c7f066a1bc4f0.1.9.1681223565280.1686751275919.1686830695232'
}


def get_data(url):
    """Fetch *url* with the browser-mimicking headers and return the body text.

    Raises:
        requests.Timeout: if the server does not respond within 10 seconds
            (the original call had no timeout and could hang forever).
        requests.HTTPError: on a non-2xx status, so an error page is never
            silently handed to the parser.
    """
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return response.text

def parse_data(data):
    """Extract, print and return (title, tlink) pairs from a JSONP payload.

    *data* is JavaScript of the form ``data_callback([...])``; the wrapper
    is stripped, the JSON array parsed, and every title/link pair printed.

    Returns:
        list[tuple[str, str]]: the (title, tlink) pairs, in feed order.

    Raises:
        ValueError: if no ``data_callback(...)`` wrapper is present
            (the original code crashed with AttributeError here).
    """
    # Raw string avoids the invalid-escape SyntaxWarning on '\('.
    # Greedy (.*) matches up to the LAST ')', so a parenthesis inside a
    # title no longer truncates the JSON (the old non-greedy form stopped
    # at the first ')').
    match = re.search(r'data_callback\((.*)\)', data, re.S)
    if match is None:
        raise ValueError('no data_callback payload found in response')
    json_data = json.loads(match.group(1))
    # Each feed item is a flat dict carrying 'title' and 'tlink' keys, so a
    # plain comprehension replaces the jsonpath '$..title' / '$..tlink'
    # queries (which return False, not [], on no match and broke zip()).
    pairs = [(item['title'], item['tlink']) for item in json_data]
    for title, tlink in pairs:
        print(title)
        print(tlink)
        print('========')
    return pairs


if __name__ == '__main__':
    # Page 1 lives at the bare special URL; pages 2-5 append "_0<n>" to it.
    parse_data(get_data(index_url))
    for page_no in range(2, 6):
        print('正在爬取第{}页。'.format(page_no))
        url = 'https://news.163.com/special/cm_yaowen20200213_0{}/?callback=data_callback'.format(page_no)
        parse_data(get_data(url))